lsm_tree/config/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5mod block_size;
6mod compression;
7mod delete_strategy;
8mod filter;
9mod hash_ratio;
10mod locator;
11mod pinning;
12mod restart_interval;
13
14pub use block_size::BlockSizePolicy;
15pub use compression::CompressionPolicy;
16pub use delete_strategy::{DeleteStrategy, DeleteStrategyPolicy};
17pub use filter::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry};
18pub use hash_ratio::HashRatioPolicy;
19pub use locator::{LocatorPolicy, LocatorPolicyEntry, LocatorPrecision};
20pub use pinning::PinningPolicy;
21pub use restart_interval::RestartIntervalPolicy;
22
/// Partitioning policy for indexes and filters.
///
/// This is a plain type alias for [`PinningPolicy`], so both names refer to
/// the same type and can be used interchangeably; the alias only exists to
/// make the intent of the `*_partitioning_policy` configuration fields
/// readable.
pub type PartitioningPolicy = PinningPolicy;
25
26#[cfg(feature = "std")]
27use crate::fs::StdFs;
28use crate::path::PathBuf;
29use crate::{
30    AnyTree, BlobTree, Cache, CompressionType, DescriptorTable, SharedSequenceNumberGenerator,
31    Tree,
32    compaction::filter::Factory,
33    comparator::SharedComparator,
34    encryption::EncryptionProvider,
35    file::TABLES_FOLDER,
36    fs::{Fs, SyncMode},
37    merge_operator::MergeOperator,
38    path::absolute_path,
39    prefix::PrefixExtractor,
40};
41// std-only: used solely by the std-gated `Config::default` / `Config::new`
42// constructors (the no_std path builds `Config` field-by-field).
43#[cfg(feature = "std")]
44use crate::{SequenceNumberCounter, comparator, path::Path, version::DEFAULT_LEVEL_COUNT};
45use alloc::sync::Arc;
46#[cfg(not(feature = "std"))]
47use alloc::vec::Vec;
48use core::ops::Range;
49
/// Per-level filesystem routing entry for tiered storage.
///
/// Maps a range of LSM levels to a base directory and filesystem backend.
/// Tables at these levels are stored under `path/tables/`.
///
/// The level range is half-open: `0..2` covers L0 and L1, but not L2.
///
/// # Example
///
/// ```
/// use lsm_tree::config::LevelRoute;
/// use lsm_tree::fs::StdFs;
/// use std::sync::Arc;
///
/// // Hot tier: L0-L1 on NVMe
/// let hot = LevelRoute {
///     levels: 0..2,
///     path: "/mnt/nvme/db".into(),
///     fs: Arc::new(StdFs),
/// };
///
/// // Cold tier: L4-L6 on HDD
/// let cold = LevelRoute {
///     levels: 4..7,
///     path: "/mnt/hdd/db".into(),
///     fs: Arc::new(StdFs),
/// };
/// ```
#[derive(Clone)]
pub struct LevelRoute {
    /// LSM levels this route covers, as a half-open range
    /// (e.g., `0..2` for L0–L1).
    pub levels: Range<u8>,

    /// Base data directory for tables at these levels.
    pub path: PathBuf,

    /// Filesystem backend for I/O at these levels.
    ///
    /// Shared behind an [`Arc`], so cloning a route clones the handle, not
    /// the backend.
    pub fs: Arc<dyn Fs>,
}
87
88impl core::fmt::Debug for LevelRoute {
89    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
90        f.debug_struct("LevelRoute")
91            .field("levels", &self.levels)
92            .field("path", &self.path)
93            .finish_non_exhaustive()
94    }
95}
96
/// Policy governing what `Tree::open` does when the on-disk MANIFEST
/// contains corrupt records.
///
/// Mirrors `RocksDB`'s `WALRecoveryMode` semantics, but applied to the
/// manifest layer (`src/version/recovery.rs`) — lsm-tree itself has no
/// WAL (durability lives one layer up in the parent fjall/keyspace
/// crate's `Journal`). The MANIFEST is the equivalent surface where
/// "loss-tolerance vs strict-consistency" matters at open time.
///
/// The default is [`AbsoluteConsistency`](Self::AbsoluteConsistency) —
/// any corrupt record fails the open. Switching to a more permissive
/// mode is an explicit, informed operator decision: you are trading
/// "the tree might silently come up with missing tables / blob files"
/// for "the tree comes up at all". When a non-default mode drops
/// records, the recovery path emits a `warn!` summary with the
/// AGGREGATE dropped count per section (`tables` / `blob_files`) —
/// individual table IDs / blob-file IDs are NOT enumerated, because
/// they were never decoded in the first place. Operators wanting a
/// per-record audit trail should pair tail-tolerant recovery with an
/// out-of-band integrity scan ([`verify_integrity`](crate::verify::verify_integrity))
/// of the recovered tree.
///
/// The variants are listed from strictest to most permissive.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum ManifestRecoveryMode {
    /// Production-safe default. Any per-record decode mismatch (bad
    /// XXH3, invalid tag, truncated TOC entry, declared-count overrun)
    /// aborts the open with the original error. Surfaces every byte
    /// of corruption; never silently drops data.
    #[default]
    AbsoluteConsistency,

    /// Power-loss-at-write-tail salvage. If the per-section iteration
    /// over the `tables` / `blob_files` records runs out of bytes
    /// before the declared count is reached (truncated tail), keep
    /// everything that decoded cleanly before the cut and emit a
    /// `warn!` listing the dropped record counts.
    ///
    /// A declared count that exceeds the section's payload capacity
    /// (e.g. `table_count` claims more entries than the section has
    /// bytes for) is treated as the same "writer committed a count
    /// header then truncated the entries" shape — the recovery
    /// downgrades the original hard fail to a `warn!` and lets the
    /// per-entry decode loop walk bytes-actually-present until the
    /// first `UnexpectedEof`.
    ///
    /// Any decode error that is NOT a clean tail truncation (bad
    /// `checksum_type` tag, etc.) still aborts the open — this mode
    /// is specifically for "the writer never finished" scenarios,
    /// not for arbitrary bit-rot in already-committed bytes.
    TolerateCorruptedTailRecords,

    /// Recover the largest consistent prefix and discard the rest.
    /// Adapts `RocksDB`'s `kPointInTimeRecovery` accept-the-prefix
    /// rule to the level/run/table nesting: on the first
    /// record-decode mismatch inside the `tables` section, the
    /// recovery keeps the records that decoded cleanly *before*
    /// the corrupt one in the current run, plus every complete
    /// earlier run in the same level, plus every complete earlier
    /// level. "Record-decode mismatch" covers ALL three failure
    /// shapes the per-record loop can surface:
    ///
    /// 1. Framing-layer XXH3 mismatch (the 8-byte digest in the
    ///    record header doesn't match `xxh3_64(payload)`).
    /// 2. Framing-header structural failure (`len > MAX_FRAME_PAYLOAD`),
    ///    surfaced as `BadHeader`. Note: `LenMismatch` (decoded `len`
    ///    disagrees with a fixed-length pin) is a SEPARATE hard-abort
    ///    case in every recovery mode, not a record-decode mismatch
    ///    for the purpose of this mode.
    /// 3. Payload decode failure AFTER a clean framing pass —
    ///    e.g. `Error::InvalidTag` from a corrupt `checksum_type`
    ///    byte inside an otherwise-framed-OK record. The framing
    ///    XXH3 happens to cover the corrupt byte too (it's a
    ///    digest of the whole payload), so the bytes decode
    ///    cleanly at the framing layer; the corruption only
    ///    surfaces inside the per-entry decode helper.
    ///
    /// PIT drops the corrupt record itself, the remaining records
    /// of that run, and every level not yet read. The same rule
    /// applies to the `blob_files` section. Clean tail-truncation
    /// is still tolerated, same as
    /// [`TolerateCorruptedTailRecords`](Self::TolerateCorruptedTailRecords).
    PointInTimeRecovery,

    /// Skip each corrupt record individually, keep all others.
    /// Maximum-availability, lossy. On any per-record decode
    /// mismatch — framing-layer XXH3 mismatch, payload-decode
    /// failure inside an otherwise-framed-OK record (e.g.
    /// `Error::InvalidTag` on a corrupt `checksum_type` byte), or
    /// a framing-header `BadHeader` — the reader logs the skip
    /// and advances exactly past the bad record using the
    /// framing-supplied length field. If the length field itself
    /// is unusable (the recorded length is outside the legal
    /// range, so the next-record boundary is unknown), the rest
    /// of that section is dropped.
    ///
    /// Intended companion to the `repair_db` tooling tracked as
    /// `#303`: this mode recovers what it can in-place; `repair_db`
    /// rebuilds the manifest from the SST files themselves when even
    /// this mode can't reach a usable state.
    SkipAnyCorruptedRecords,
}
196
/// Kind of LSM-tree.
///
/// Converts to and from a single-byte tag (see the `From` / `TryFrom`
/// impls below): `0` is [`TreeType::Standard`], `1` is [`TreeType::Blob`].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TreeType {
    /// Standard LSM-tree, see [`Tree`]
    Standard,

    /// Key-value separated LSM-tree, see [`BlobTree`]
    Blob,
}

impl From<TreeType> for u8 {
    /// Encodes the tree type as its one-byte tag.
    fn from(tree_type: TreeType) -> Self {
        // Kept as an exhaustive match (rather than an `as` cast) so the tag
        // values do not depend on variant declaration order and a new
        // variant cannot be added without choosing its tag.
        match tree_type {
            TreeType::Standard => 0,
            TreeType::Blob => 1,
        }
    }
}

impl TryFrom<u8> for TreeType {
    /// Unit error: the only failure is "unknown tag", which carries no
    /// further detail.
    type Error = ();

    /// Decodes a one-byte tag; any value other than `0` or `1` is rejected.
    fn try_from(tag: u8) -> Result<Self, Self::Error> {
        let tree_type = match tag {
            0 => Self::Standard,
            1 => Self::Blob,
            _ => return Err(()),
        };

        Ok(tree_type)
    }
}
227
/// Relative folder name used as the data directory by the default
/// configuration (`Config::default` resolves it to an absolute path).
// The default-config path is std-gated, so without `std` this constant has no
// user; the `dead_code` allowance below keeps no_std builds warning-free.
#[cfg_attr(
    not(feature = "std"),
    allow(
        dead_code,
        reason = "default data-folder path used only on the std-gated default-config path"
    )
)]
const DEFAULT_FILE_FOLDER: &str = ".lsm.data";
236
/// Options for key-value separation
///
/// Start from [`Default`] and adjust individual settings with the
/// builder-style setters (each consumes and returns `Self`). The fields are
/// `pub` but hidden from the rendered docs; the setters are the documented
/// way to configure them.
#[derive(Clone, Debug, PartialEq)]
pub struct KvSeparationOptions {
    /// What type of compression is used for blobs
    #[doc(hidden)]
    pub compression: CompressionType,

    /// Blob file target size in bytes
    #[doc(hidden)]
    pub file_target_size: u64,

    /// Key-value separation threshold in bytes
    #[doc(hidden)]
    pub separation_threshold: u32,

    /// Staleness ratio: how fragmented a blob file needs to be before it is
    /// picked up by garbage collection.
    // NOTE(review): presumably a fraction in `0.0..=1.0` — nothing in this
    // file validates the range; confirm against the GC implementation.
    #[doc(hidden)]
    pub staleness_threshold: f32,

    /// Age cutoff ratio.
    // NOTE(review): looks like the fraction of the oldest blob files that
    // garbage collection considers — confirm against the GC implementation.
    #[doc(hidden)]
    pub age_cutoff: f32,

    /// Pre-trained zstd dictionary for blob-file dictionary compression.
    ///
    /// Required when `compression` is [`CompressionType::ZstdDict`].
    /// The `dict_id` in the compression type must match [`ZstdDictionary::id`](crate::ZstdDictionary::id).
    #[cfg(zstd_any)]
    #[doc(hidden)]
    pub zstd_dictionary: Option<alloc::sync::Arc<crate::compression::ZstdDictionary>>,
}
266
267impl Default for KvSeparationOptions {
268    fn default() -> Self {
269        Self {
270            #[cfg(feature="lz4")]
271            compression:   CompressionType::Lz4,
272
273            #[cfg(not(feature="lz4"))]
274            compression: CompressionType::None,
275
276            file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024,
277            separation_threshold: /* 1 KiB */ 1_024,
278
279            staleness_threshold: 0.25,
280            age_cutoff: 0.25,
281
282            #[cfg(zstd_any)]
283            zstd_dictionary: None,
284        }
285    }
286}
287
288impl KvSeparationOptions {
289    /// Sets the blob compression method.
290    #[must_use]
291    pub fn compression(mut self, compression: CompressionType) -> Self {
292        self.compression = compression;
293        self
294    }
295
296    /// Sets the target size of blob files.
297    ///
298    /// Smaller blob files allow more granular garbage collection
299    /// which allows lower space amp for lower write I/O cost.
300    ///
301    /// Larger blob files decrease the number of files on disk and maintenance
302    /// overhead.
303    ///
304    /// Defaults to 64 MiB.
305    #[must_use]
306    pub fn file_target_size(mut self, bytes: u64) -> Self {
307        self.file_target_size = bytes;
308        self
309    }
310
311    /// Sets the key-value separation threshold in bytes.
312    ///
313    /// Smaller value will reduce compaction overhead and thus write amplification,
314    /// at the cost of lower read performance.
315    ///
316    /// Defaults to 1 KiB.
317    #[must_use]
318    pub fn separation_threshold(mut self, bytes: u32) -> Self {
319        self.separation_threshold = bytes;
320        self
321    }
322
323    /// Sets the staleness threshold percentage.
324    ///
325    /// The staleness percentage determines how much a blob file needs to be fragmented to be
326    /// picked up by the garbage collection.
327    ///
328    /// Defaults to 33%.
329    #[must_use]
330    pub fn staleness_threshold(mut self, ratio: f32) -> Self {
331        self.staleness_threshold = ratio;
332        self
333    }
334
335    /// Sets the age cutoff threshold.
336    ///
337    /// Defaults to 20%.
338    #[must_use]
339    pub fn age_cutoff(mut self, ratio: f32) -> Self {
340        self.age_cutoff = ratio;
341        self
342    }
343
344    /// Sets the zstd dictionary for blob-file dictionary compression.
345    ///
346    /// Required when [`compression`](Self::compression) is set to
347    /// [`CompressionType::ZstdDict`].  The `dict_id` encoded in the
348    /// compression type must equal [`ZstdDictionary::id()`](crate::ZstdDictionary::id) of the
349    /// supplied dictionary; [`Config::open`] will return
350    /// [`Error::ZstdDictMismatch`](crate::Error::ZstdDictMismatch) if
351    /// they disagree.
352    #[cfg(zstd_any)]
353    #[must_use]
354    pub fn dict(
355        mut self,
356        dictionary: alloc::sync::Arc<crate::compression::ZstdDictionary>,
357    ) -> Self {
358        self.zstd_dictionary = Some(dictionary);
359        self
360    }
361}
362
/// Tree configuration builder
///
/// Collects every tunable of a tree: storage location and filesystem
/// backend, caches, per-level block policies, recovery and durability
/// behaviour, and compaction settings. A ready-made default is available
/// through [`Default`] on `std` builds only; that impl is std-gated, and
/// without `std` the caller constructs `Config` explicitly with its own
/// [`Fs`] backend.
pub struct Config {
    /// Folder path
    #[doc(hidden)]
    pub path: PathBuf,

    /// Default filesystem backend for levels without an explicit route.
    ///
    /// Defaults to [`StdFs`]. Use [`Config::with_fs`] to plug in an
    /// alternative backend such as [`MemFs`](crate::fs::MemFs).
    ///
    /// Both fresh tree creation and reopening (recovery) are supported
    /// for any backend that implements [`Fs`].
    #[doc(hidden)]
    pub fs: Arc<dyn Fs>,

    /// Per-level filesystem routing for tiered storage.
    ///
    /// When set, tables at different LSM levels can be stored on different
    /// storage devices (e.g., NVMe for L0–L1, SSD for L2–L4, HDD for L5–L6).
    /// Each entry maps a range of levels to a base directory and filesystem
    /// backend. Uncovered levels fall back to the primary `path` and `fs`.
    ///
    /// Zero additional overhead when `None` — only a single branch check;
    /// path construction allocations are unchanged.
    #[doc(hidden)]
    pub level_routes: Option<Vec<LevelRoute>>,

    /// Block cache to use
    #[doc(hidden)]
    pub cache: Arc<Cache>,

    /// Descriptor table to use
    #[doc(hidden)]
    pub descriptor_table: Option<Arc<DescriptorTable>>,

    /// Number of levels of the LSM tree (depth of tree)
    ///
    /// Once set, the level count is fixed (in the "manifest" file)
    pub level_count: u8,

    /// What type of compression is used for data blocks
    pub data_block_compression_policy: CompressionPolicy,

    /// What type of compression is used for index blocks
    pub index_block_compression_policy: CompressionPolicy,

    /// Restart interval inside data blocks
    pub data_block_restart_interval_policy: RestartIntervalPolicy,

    /// Restart interval inside index blocks
    pub index_block_restart_interval_policy: RestartIntervalPolicy,

    /// Block size of data blocks
    pub data_block_size_policy: BlockSizePolicy,

    /// Whether to pin index blocks
    pub index_block_pinning_policy: PinningPolicy,

    /// Whether to pin filter blocks
    pub filter_block_pinning_policy: PinningPolicy,

    /// Whether to pin top level index of partitioned index
    pub top_level_index_block_pinning_policy: PinningPolicy,

    /// Whether to pin top level index of partitioned filter
    pub top_level_filter_block_pinning_policy: PinningPolicy,

    /// Data block hash ratio
    pub data_block_hash_ratio_policy: HashRatioPolicy,

    /// Whether to partition index blocks
    pub index_block_partitioning_policy: PartitioningPolicy,

    /// Whether to partition filter blocks
    pub filter_block_partitioning_policy: PartitioningPolicy,

    /// Partition size when using partitioned indexes
    pub index_block_partition_size_policy: BlockSizePolicy,

    /// Partition size when using partitioned filters
    pub filter_block_partition_size_policy: BlockSizePolicy,

    /// If `true`, the last level will not build filters, reducing the filter size of a database
    /// by ~90% typically
    pub(crate) expect_point_read_hits: bool,

    /// Per-block Page ECC. When `true`, every block on disk carries a parity
    /// trailer; on read, if the block's XXH3 disagrees with the on-disk bytes,
    /// the reader attempts recovery from the trailer before surfacing the
    /// corruption. The correction scheme is selected at runtime
    /// (`update_runtime_config`): per-word SEC-DED (the default), single XOR
    /// parity, or Reed-Solomon. Requires the `page_ecc` cargo feature — opening a
    /// tree with `page_ecc = true` on a build without the feature returns
    /// [`crate::Error::PageEccUnsupported`].
    ///
    /// Off by default. `RocksDB` ships per-block ECC as an operator-
    /// chosen knob (typically off on RAID-protected media, on on
    /// single-drive) and the cost is non-trivial on the write path,
    /// so the default keeps the existing behaviour.
    pub(crate) page_ecc: bool,

    /// Initial [`crate::runtime_config::RuntimeConfig`] snapshot
    /// the tree starts with. Seeds both the first
    /// `persist_version` call and the Tree's
    /// `RuntimeConfigHandle`, so a non-default value supplied via
    /// [`Config::with_runtime_config`] is honoured from byte zero
    /// of the manifest. Defaults to `RuntimeConfig::default()` —
    /// matches the pre-existing implicit behaviour.
    #[expect(
        clippy::struct_field_names,
        reason = "name mirrors the type for grep-ability across the persist + Tree handle init wiring"
    )]
    pub(crate) initial_runtime_config: crate::runtime_config::RuntimeConfig,

    /// Filter construction policy
    pub filter_policy: FilterPolicy,

    /// Retrieval-ribbon locator policy (per level). Defaults to
    /// [`LocatorPolicy::block_level`]: written SSTs carry an optional `locator`
    /// section mapping each key to its data block for O(1) point reads (skipping
    /// the index-block binary search). Set [`LocatorPolicy::disabled`] to opt
    /// out — disabled levels produce byte-identical SSTs (no section).
    pub locator_policy: LocatorPolicy,

    /// Compaction filter factory
    pub compaction_filter_factory: Option<Arc<dyn Factory>>,

    /// Prefix extractor for prefix bloom filters.
    ///
    /// When set, the bloom filter indexes extracted prefixes in addition to
    /// full keys, allowing prefix scans to skip segments that contain no
    /// matching prefixes.
    pub prefix_extractor: Option<Arc<dyn PrefixExtractor>>,

    /// Merge operator for commutative operations
    ///
    /// When set, enables `merge()` operations that store partial updates
    /// which are lazily combined during reads and compaction.
    pub merge_operator: Option<Arc<dyn MergeOperator>>,

    /// Key-value separation options (see [`KvSeparationOptions`]).
    ///
    /// `None` in the default configuration.
    // NOTE(review): presumably `None` keeps values inline (standard tree) and
    // `Some` selects the key-value separated (blob) tree — confirm against
    // `Config::open`.
    #[doc(hidden)]
    pub kv_separation_opts: Option<KvSeparationOptions>,

    /// Custom user key comparator.
    ///
    /// When set, all key comparisons use this comparator instead of the
    /// default lexicographic byte ordering. Once a tree is opened with a
    /// comparator, it must always be re-opened with the same comparator.
    // Not `pub` — use `Config::comparator()` builder method as the public API.
    #[doc(hidden)]
    pub(crate) comparator: SharedComparator,

    /// Block-level encryption provider for encryption at rest.
    ///
    /// When set, all blocks (data, index, filter, meta) are encrypted
    /// using this provider after compression and before checksumming.
    pub(crate) encryption: Option<Arc<dyn EncryptionProvider>>,

    /// Policy governing what `Tree::open` does when the on-disk
    /// MANIFEST contains corrupt records. Defaults to
    /// [`ManifestRecoveryMode::AbsoluteConsistency`], the only
    /// production-safe choice — any corruption aborts the open. Other
    /// modes trade strict correctness for partial-availability after a
    /// disaster; see the enum doc for the operational scenarios that
    /// motivate each mode.
    pub(crate) manifest_recovery_mode: ManifestRecoveryMode,

    /// Durability level for every fsync the tree issues (SST writes,
    /// manifest, version persist, directory syncs).
    ///
    /// Defaults to [`SyncMode::Normal`] (plain `fsync`), matching the
    /// out-of-the-box durability of `RocksDB` and `SQLite`. Only observable on
    /// macOS, where [`SyncMode::Full`] opts into the much slower
    /// `F_FULLFSYNC` barrier; on other platforms both modes are plain
    /// `fsync`. Set via [`Config::sync_mode`].
    pub(crate) sync_mode: SyncMode,

    /// When `true` (the default), [`Config::open`] and [`Config::repair`]
    /// acquire an exclusive cross-process lock on a `LOCK` file in the tree
    /// directory (an advisory OS file lock) and hold it for the lifetime of the
    /// [`Tree`] (open) or the duration of the call (repair). A
    /// second process attempting to open / repair the same directory fails fast
    /// with [`Error::Locked`](crate::Error::Locked) instead of racing on the
    /// manifest. Set `false` via [`Config::with_directory_lock`] only when the
    /// embedder already enforces exclusive directory ownership at a higher layer
    /// (e.g. a keyspace / journal manager). Best-effort per `Fs` backend: real
    /// on-disk backends enforce it, in-memory backends are single-process and
    /// satisfy it vacuously.
    pub(crate) directory_lock: bool,

    /// Edit-log size (bytes) past which the next manifest persist rotates: it
    /// writes a fresh full snapshot and starts an empty log instead of appending
    /// another [`VersionEdit`](crate::version::edit::VersionEdit). Bounds both
    /// recovery replay time (edits to re-apply) and log disk use, while keeping
    /// the common per-flush path a tiny `O(changed-levels)` append rather than an
    /// `O(all-SSTs)` full manifest rewrite.
    ///
    /// Defaults to 1 MiB (≈ tens of thousands of edits). Set via
    /// [`Config::manifest_log_rotate_bytes`]. A smaller value rotates more
    /// often (shorter recovery, more frequent full-snapshot writes); `0` rotates
    /// on every upgrade, degenerating to the full-rewrite-per-version behaviour.
    pub(crate) manifest_log_rotate_bytes: u64,

    /// Compaction I/O rate limit in bytes per second.
    ///
    /// Caps the rate at which the compaction worker is allowed to issue
    /// I/O, so background compaction cannot saturate the device and starve
    /// user point reads / range scans (P99 stability). `0` (the default)
    /// means unlimited — no throttling, no behaviour change. Flush and
    /// user reads are never throttled, only compaction. Set via
    /// [`Config::compaction_rate_limit`].
    pub(crate) compaction_rate_limit: u64,

    /// Worker-thread count for compaction parallelism (`std` only), used two
    /// ways: it sizes the per-tree block-compression pool built at open when
    /// [`Self::compaction_pool`] is `None`, and it caps how many range-parallel
    /// sub-compactions a single compaction is split into. Default
    /// `max(1, available_parallelism / 2)` — leaves half the cores for
    /// application work. `1` forces the serial path for both. Without the
    /// `parallel` feature there is no built-in pool, so block compression and
    /// sub-compaction ranges run serially even for a value > 1. Set via
    /// [`Config::compaction_threads`].
    #[cfg(feature = "std")] // no-std: parallel compaction unavailable (no threads)
    pub(crate) compaction_threads: usize,

    /// Optional shared compaction thread pool. `None` (default) = a per-tree
    /// pool is built at [`crate::Tree::open`] sized by [`Self::compaction_threads`]
    /// (predictable, matches the per-DB pattern). `Some` = caller-supplied
    /// executor shared across every tree holding this `Arc`, bounding total
    /// threads regardless of tree count. Set via [`Config::compaction_pool`].
    #[cfg(feature = "std")]
    pub(crate) compaction_pool: Option<Arc<dyn crate::table::writer::CompactionSpawner>>,

    /// Minimum total input size (bytes) for a compaction to be split into
    /// parallel sub-compactions. Below it the compaction stays single-threaded
    /// (per-thread setup + extra output tables outweigh the parallelism on small
    /// compactions). Default
    /// [`SUBCOMPACTION_MIN_INPUT_BYTES`](crate::compaction::worker::SUBCOMPACTION_MIN_INPUT_BYTES)
    /// (8 MiB). Set via [`Config::subcompaction_min_bytes`].
    #[cfg(feature = "std")]
    pub(crate) subcompaction_min_bytes: u64,

    /// Test-only failpoint: when armed, the first parallel sub-compaction range
    /// that observes it returns an error and disarms it, so the crash-safety
    /// rollback paths (sibling output rollback, input restore) can be exercised
    /// deterministically. Behind `cfg(test)`, never compiled into release builds.
    #[cfg(all(test, feature = "std"))]
    pub(crate) fail_one_subcompaction: Arc<core::sync::atomic::AtomicBool>,

    /// Test-only failpoint: when armed, a tight-space compaction returns an error
    /// immediately after durably installing (and punching) its FIRST slice, so
    /// the crash-mid-loop recovery path (reopen a tree whose manifest carries a
    /// persisted input restriction) can be exercised deterministically. Behind
    /// `cfg(test)`, never compiled into release builds.
    #[cfg(all(test, feature = "std"))]
    pub(crate) fail_tight_after_first_slice: Arc<core::sync::atomic::AtomicBool>,

    /// Pre-trained zstd dictionary for dictionary compression.
    ///
    /// When set together with a [`CompressionType::ZstdDict`] compression
    /// policy, data blocks are compressed using this dictionary. The
    /// dictionary must remain the same for the lifetime of the tree —
    /// opening a tree with a different dictionary will produce
    /// [`Error::ZstdDictMismatch`](crate::Error::ZstdDictMismatch) errors.
    #[cfg(zstd_any)]
    pub(crate) zstd_dictionary: Option<Arc<crate::compression::ZstdDictionary>>,

    /// The global sequence number generator.
    ///
    /// Should be shared between multiple trees of a database.
    pub(crate) seqno: SharedSequenceNumberGenerator,

    /// Sequence number watermark that is visible to readers.
    ///
    /// Used for MVCC snapshots and to control which updates are
    /// observable in a given view of the database.
    pub(crate) visible_seqno: SharedSequenceNumberGenerator,
}
642
// TODO: remove default?
// Gated on `std`: the default filesystem backend is `StdFs` and the default
// path is made absolute through std::path::absolute. Without `std`, callers
// build `Config` explicitly and supply their own `Fs`.
#[cfg(feature = "std")]
impl Default for Config {
    /// Builds the out-of-the-box configuration: data under `.lsm.data`
    /// (resolved to an absolute path) on the standard filesystem, a 16 MiB
    /// block cache, 4 KiB data blocks, bloom filters at 10 bits per key,
    /// strict manifest recovery, and the directory lock enabled.
    fn default() -> Self {
        // Data-block compression depends on the `lz4` feature; the `Lz4`
        // variant is only named when that feature is enabled.
        #[cfg(feature = "lz4")]
        let data_block_compression_policy =
            CompressionPolicy::new([CompressionType::None, CompressionType::Lz4]);

        #[cfg(not(feature = "lz4"))]
        let data_block_compression_policy = CompressionPolicy::new([CompressionType::None]);

        // Half of the available cores, but never fewer than one thread; also
        // one thread when the parallelism cannot be queried.
        let compaction_threads = std::thread::available_parallelism()
            .map_or(1, |cores| usize::max(cores.get() / 2, 1));

        Self {
            // --- Storage location ---
            path: absolute_path(Path::new(DEFAULT_FILE_FOLDER)),
            fs: Arc::new(StdFs),
            level_routes: None,

            // --- Shared resources ---
            descriptor_table: Some(Arc::new(DescriptorTable::new(256))),
            seqno: SharedSequenceNumberGenerator::from(SequenceNumberCounter::default()),
            visible_seqno: SharedSequenceNumberGenerator::from(SequenceNumberCounter::default()),

            // 16 MiB block cache
            cache: Arc::new(Cache::with_capacity_bytes(16 * 1_024 * 1_024)),

            // --- Block layout ---
            data_block_restart_interval_policy: RestartIntervalPolicy::all(16),
            index_block_restart_interval_policy: RestartIntervalPolicy::all(1),

            level_count: DEFAULT_LEVEL_COUNT,

            data_block_size_policy: BlockSizePolicy::all(4_096),

            // --- Pinning ---
            index_block_pinning_policy: PinningPolicy::new([true, true, false]),
            filter_block_pinning_policy: PinningPolicy::new([true, false]),

            top_level_index_block_pinning_policy: PinningPolicy::all(true), // TODO: implement
            top_level_filter_block_pinning_policy: PinningPolicy::all(true), // TODO: implement

            // --- Partitioning ---
            // Index blocks are partitioned at every level so that a bit-flip
            // inside one sub-index block only loses the keys covered by that
            // partition instead of the whole SST. With a full (unpartitioned)
            // index there is no redundancy inside the block: a single corrupt
            // byte makes every data block of the table unreachable. The
            // isolation property this default depends on is covered by
            // tests/partitioned_index_blast_radius.rs.
            index_block_partitioning_policy: PinningPolicy::all(true),
            // The filter-block default deliberately stays at the pre-#329
            // shape (L3+ only). A corrupt filter block can yield a false
            // negative (filter says "not present" → read short-circuits →
            // caller misses an existing key), a correctness hazard of a
            // different kind than index corruption (where the read fails
            // loudly). Changing this default is a separate decision that
            // awaits a filter blast-radius / false-negative analysis;
            // symmetry with the index default alone does not justify it.
            filter_block_partitioning_policy: PinningPolicy::new([false, false, false, true]),

            index_block_partition_size_policy: BlockSizePolicy::all(4_096), // TODO: implement
            filter_block_partition_size_policy: BlockSizePolicy::all(4_096), // TODO: implement

            // --- Compression ---
            data_block_compression_policy,
            index_block_compression_policy: CompressionPolicy::all(CompressionType::None),

            data_block_hash_ratio_policy: HashRatioPolicy::all(0.0),

            // --- Locators and filters ---
            locator_policy: LocatorPolicy::block_level(),
            filter_policy: FilterPolicy::all(FilterPolicyEntry::Bloom(
                BloomConstructionPolicy::BitsPerKey(10.0),
            )),

            // --- Optional user hooks: none installed by default ---
            compaction_filter_factory: None,
            merge_operator: None,

            prefix_extractor: None,

            expect_point_read_hits: false,

            page_ecc: false,

            initial_runtime_config: crate::runtime_config::RuntimeConfig::default(),

            kv_separation_opts: None,

            #[cfg(zstd_any)]
            zstd_dictionary: None,

            comparator: comparator::default_comparator(),
            encryption: None,

            // --- Recovery and durability ---
            manifest_recovery_mode: ManifestRecoveryMode::AbsoluteConsistency,
            sync_mode: SyncMode::Normal,
            directory_lock: true,
            // 1 MiB
            manifest_log_rotate_bytes: 1024 * 1024,
            // 0 = unlimited
            compaction_rate_limit: 0,

            // --- Compaction parallelism ---
            // The `cfg` attributes mirror the ones on the struct fields.
            #[cfg(feature = "std")]
            compaction_threads,
            #[cfg(feature = "std")]
            compaction_pool: None,
            #[cfg(feature = "std")]
            subcompaction_min_bytes: crate::compaction::worker::SUBCOMPACTION_MIN_INPUT_BYTES,

            // --- Test-only failpoints, disarmed ---
            #[cfg(all(test, feature = "std"))]
            fail_one_subcompaction: Arc::new(core::sync::atomic::AtomicBool::new(false)),
            #[cfg(all(test, feature = "std"))]
            fail_tight_after_first_slice: Arc::new(core::sync::atomic::AtomicBool::new(false)),
        }
    }
}
753
/// Name of the lock file created in a tree directory for the cross-process
/// exclusive directory lock.
///
/// `acquire_directory_lock` joins this name onto the tree directory to form
/// the path of the file it opens and locks.
#[cfg_attr(
    not(feature = "std"),
    allow(
        dead_code,
        reason = "directory-lock filename used only by the std-gated lock-acquisition path"
    )
)]
pub(crate) const DIRECTORY_LOCK_FILE: &str = "LOCK";
764
/// Takes the cross-process exclusive lock on a tree directory, if requested.
///
/// With `enabled == false` this does nothing and returns `Ok(None)`. Otherwise
/// the `LOCK` file under `dir` is opened through `fs` (read + write, created
/// when absent) and a non-blocking exclusive advisory lock is attempted on it.
///
/// The returned handle must be held for as long as exclusivity is required:
/// dropping it releases the lock (the OS frees an advisory lock when the
/// fd / handle closes). `dir` must already exist; for a fresh tree the caller
/// creates it before acquiring.
///
/// # Errors
///
/// Returns [`Error::Locked`](crate::Error::Locked) when another live instance
/// holds the lock, and propagates any error raised while opening or locking
/// the file.
#[cfg(feature = "std")]
pub(crate) fn acquire_directory_lock(
    fs: &dyn Fs,
    dir: &Path,
    enabled: bool,
) -> crate::Result<Option<Box<dyn crate::fs::FsFile>>> {
    if !enabled {
        return Ok(None);
    }

    let handle = fs.open(
        &dir.join(DIRECTORY_LOCK_FILE),
        &crate::fs::FsOpenOptions::new()
            .read(true)
            .write(true)
            .create(true),
    )?;

    // A `false` result means the lock was not obtained; report the directory
    // as locked rather than handing out an unlocked handle.
    if !handle.try_lock_exclusive()? {
        return Err(crate::Error::Locked(dir.display().to_string()));
    }

    Ok(Some(handle))
}
798
799impl Config {
800    /// Initializes a new config
801    // std-only: seeds the remaining fields from `Config::default`, whose
802    // default `Fs` is `StdFs`. no_std callers build `Config` field-by-field
803    // with a caller-provided `Fs`.
804    #[cfg(feature = "std")]
805    pub fn new<P: AsRef<Path>>(
806        path: P,
807        seqno: SequenceNumberCounter,
808        visible_seqno: SequenceNumberCounter,
809    ) -> Self {
810        Self {
811            path: absolute_path(path.as_ref()),
812            seqno: Arc::new(seqno),
813            visible_seqno: Arc::new(visible_seqno),
814            ..Default::default()
815        }
816    }
817
818    /// Sets the default filesystem backend used for levels without an explicit route.
819    ///
820    /// Defaults to [`StdFs`]. Use [`MemFs`](crate::fs::MemFs) for
821    /// in-memory trees (testing, ephemeral indexes).
822    ///
823    /// # Example
824    ///
825    /// ```
826    /// # fn main() -> lsm_tree::Result<()> {
827    /// use lsm_tree::{Config, SequenceNumberCounter};
828    /// use lsm_tree::fs::MemFs;
829    ///
830    /// let tree = Config::new(
831    ///     "/virtual/tree",
832    ///     SequenceNumberCounter::default(),
833    ///     SequenceNumberCounter::default(),
834    /// )
835    /// .with_fs(MemFs::new())
836    /// .open()?;
837    /// # Ok(())
838    /// # }
839    /// ```
840    #[must_use]
841    pub fn with_fs<F: Fs>(mut self, fs: F) -> Self {
842        self.fs = Arc::new(fs);
843        self
844    }
845
846    /// Sets the default filesystem backend from an existing shared handle.
847    ///
848    /// Useful when multiple configs should reuse the same backend
849    /// instance, including trait objects and backends that are not `Clone`.
850    ///
851    #[must_use]
852    pub fn with_shared_fs(mut self, fs: Arc<dyn Fs>) -> Self {
853        self.fs = fs;
854        self
855    }
856
857    /// Opens a tree using the config.
858    ///
859    /// # Errors
860    ///
861    /// Will return `Err` if an IO error occurs.
862    /// Returns [`Error::ZstdDictMismatch`](crate::Error::ZstdDictMismatch) if
863    /// the compression policy references a `dict_id` that doesn't match the
864    /// configured dictionary.
865    pub fn open(self) -> crate::Result<AnyTree> {
866        #[cfg(zstd_any)]
867        self.validate_zstd_dictionary()?;
868
869        // On a zstd build the live block path seals encrypted blocks through
870        // the AAD-bound envelope, so the configured provider MUST implement it.
871        // Reject an opaque-only provider here, at open time, instead of letting
872        // it fail on the first encrypted read/write.
873        #[cfg(zstd_any)]
874        if self
875            .encryption
876            .as_ref()
877            .is_some_and(|enc| !enc.supports_aad_block_path())
878        {
879            return Err(crate::Error::Encrypt(
880                "encryption provider does not implement the AAD-bound block path \
881                 (encrypt_block_aad / decrypt_block_aad) required for encrypted \
882                 blocks on a zstd build",
883            ));
884        }
885
886        Ok(if self.kv_separation_opts.is_some() {
887            AnyTree::Blob(BlobTree::open(self)?)
888        } else {
889            AnyTree::Standard(Tree::open(self)?)
890        })
891    }
892
893    /// Validates that every `ZstdDict` entry in compression policies references
894    /// a `dict_id` that matches the configured dictionary. Catches mismatches
895    /// at open time rather than at first block write/read.
896    #[cfg(zstd_any)]
897    fn validate_zstd_dictionary(&self) -> crate::Result<()> {
898        let dict_id = self.zstd_dictionary.as_ref().map(|d| d.id());
899
900        // NOTE: Only data block policies are validated. Index blocks never
901        // carry a dictionary — Writer::use_index_block_compression() downgrades
902        // ZstdDict to plain Zstd. Validating index policies here would reject
903        // configs that use ZstdDict solely for index blocks even though the
904        // writer handles them correctly.
905        for ct in self.data_block_compression_policy.iter() {
906            if let &CompressionType::ZstdDict {
907                dict_id: required, ..
908            } = ct
909            {
910                match dict_id {
911                    None => {
912                        return Err(crate::Error::ZstdDictMismatch {
913                            expected: required,
914                            got: None,
915                        });
916                    }
917                    Some(actual) if actual != required => {
918                        return Err(crate::Error::ZstdDictMismatch {
919                            expected: required,
920                            got: Some(actual),
921                        });
922                    }
923                    _ => {}
924                }
925            }
926        }
927
928        // Blob files with ZstdDict compression must have a matching dictionary.
929        if let Some(ref kv_opts) = self.kv_separation_opts
930            && let CompressionType::ZstdDict {
931                dict_id: required, ..
932            } = kv_opts.compression
933        {
934            match kv_opts.zstd_dictionary.as_ref().map(|d| d.id()) {
935                None => {
936                    return Err(crate::Error::ZstdDictMismatch {
937                        expected: required,
938                        got: None,
939                    });
940                }
941                Some(actual) if actual != required => {
942                    return Err(crate::Error::ZstdDictMismatch {
943                        expected: required,
944                        got: Some(actual),
945                    });
946                }
947                _ => {}
948            }
949        }
950
951        Ok(())
952    }
953
954    /// Like [`Config::new`], but accepts pre-built shared generators.
955    ///
956    /// This is useful when the caller already has
957    /// [`SharedSequenceNumberGenerator`] instances (e.g., from a higher-level
958    /// database that shares generators across multiple trees).
959    // std-only: see [`Config::new`] — seeds via `Config::default` (`StdFs`).
960    #[cfg(feature = "std")]
961    pub fn new_with_generators<P: AsRef<Path>>(
962        path: P,
963        seqno: SharedSequenceNumberGenerator,
964        visible_seqno: SharedSequenceNumberGenerator,
965    ) -> Self {
966        Self {
967            path: absolute_path(path.as_ref()),
968            seqno,
969            visible_seqno,
970            ..Default::default()
971        }
972    }
973}
974
975#[cfg(all(test, zstd_any))]
976mod tests;
977
978impl Config {
979    /// Returns the tables folder path and [`Fs`] backend for the given level.
980    ///
981    /// If [`level_routes`](Self::level_routes) has an entry covering this
982    /// level, uses that entry's path and `Fs`. Otherwise falls back to the
983    /// primary [`path`](Self::path) and [`fs`](Self::fs).
984    #[must_use]
985    pub fn tables_folder_for_level(&self, level: u8) -> (PathBuf, Arc<dyn Fs>) {
986        if let Some(routes) = &self.level_routes {
987            for route in routes {
988                if route.levels.contains(&level) {
989                    return (route.path.join(TABLES_FOLDER), route.fs.clone());
990                }
991            }
992        }
993        (self.path.join(TABLES_FOLDER), self.fs.clone())
994    }
995
996    /// Best-effort minimum free space (bytes) across every filesystem this tree
997    /// writes to: the primary [`path`](Self::path) plus each
998    /// [`level_routes`](Self::level_routes) volume.
999    ///
1000    /// The tightest volume bounds storage admission and compaction space gating,
1001    /// since a full routed (cold-tier) volume fails a flush / compaction
1002    /// targeting it even while the primary still has room. A backend that cannot
1003    /// report free space (or an I/O hiccup) contributes `u64::MAX`, so a probe
1004    /// failure never fabricates disk pressure.
1005    #[must_use]
1006    pub(crate) fn min_available_space(&self) -> u64 {
1007        let mut free = self.fs.available_space(&self.path).unwrap_or(u64::MAX);
1008        if let Some(routes) = &self.level_routes {
1009            for route in routes {
1010                free = free.min(route.fs.available_space(&route.path).unwrap_or(u64::MAX));
1011            }
1012        }
1013        free
1014    }
1015
1016    /// Returns all unique tables folders that need to be scanned during
1017    /// recovery: the primary folder plus every [`LevelRoute`] folder.
1018    #[must_use]
1019    pub fn all_tables_folders(&self) -> Vec<(PathBuf, Arc<dyn Fs>)> {
1020        let primary_fs: Arc<dyn Fs> = self.fs.clone();
1021        let mut folders: Vec<(PathBuf, Arc<dyn Fs>)> =
1022            vec![(self.path.join(TABLES_FOLDER), primary_fs)];
1023
1024        if let Some(routes) = &self.level_routes {
1025            for route in routes {
1026                let folder = route.path.join(TABLES_FOLDER);
1027                // Dedup by path: scanning the same directory twice would cause
1028                // already-recovered tables to be classified as orphans and
1029                // deleted. Routing the same path through different Fs backends
1030                // is a configuration error (level_routes validation in
1031                // Config::level_routes rejects overlapping ranges).
1032                if !folders.iter().any(|(p, _)| *p == folder) {
1033                    folders.push((folder, route.fs.clone()));
1034                }
1035            }
1036        }
1037
1038        folders
1039    }
1040
1041    /// Configures per-level filesystem routing for tiered storage.
1042    ///
1043    /// Each [`LevelRoute`] maps a range of LSM levels to a base directory
1044    /// and filesystem backend. Levels not covered by any route fall back to
1045    /// the primary `path` and `fs`.
1046    ///
1047    /// # Reopen contract
1048    ///
1049    /// The route configuration is **not persisted** in the manifest.
1050    /// On reopen, the [`Config`] must specify `level_routes` such that
1051    /// [`all_tables_folders`](Self::all_tables_folders) includes every
1052    /// directory and filesystem pair that may contain existing SST files
1053    /// for this tree.
1054    ///
1055    /// Changing the mapping from levels to paths is allowed as long as
1056    /// the previously used folders remain covered. If old folders are
1057    /// omitted, recovery may fail with
1058    /// [`RouteMismatch`](crate::Error::RouteMismatch) (when all missing
1059    /// tables are on uncovered levels) or
1060    /// [`Unrecoverable`](crate::Error::Unrecoverable) (when some missing
1061    /// tables are on levels that are still covered).
1062    ///
1063    /// # Panics
1064    ///
1065    /// Panics if any route has an empty range or if any two routes have
1066    /// overlapping level ranges.
1067    #[must_use]
1068    pub fn level_routes(mut self, routes: Vec<LevelRoute>) -> Self {
1069        // Validate no empty/inverted ranges
1070        for route in &routes {
1071            assert!(
1072                route.levels.start < route.levels.end,
1073                "empty or inverted level route range: {:?}",
1074                route.levels,
1075            );
1076        }
1077
1078        // Validate no overlapping ranges
1079        for (i, a) in routes.iter().enumerate() {
1080            for b in routes.iter().skip(i + 1) {
1081                assert!(
1082                    a.levels.end <= b.levels.start || b.levels.end <= a.levels.start,
1083                    "overlapping level routes: {:?} and {:?}",
1084                    a.levels,
1085                    b.levels,
1086                );
1087            }
1088        }
1089        self.level_routes = if routes.is_empty() {
1090            None
1091        } else {
1092            // Normalize paths the same way Config::new normalizes self.path
1093            Some(
1094                routes
1095                    .into_iter()
1096                    .map(|mut r| {
1097                        r.path = absolute_path(&r.path);
1098                        r
1099                    })
1100                    .collect(),
1101            )
1102        };
1103        self
1104    }
1105
1106    /// Overrides the sequence number generator.
1107    ///
1108    /// By default, [`SequenceNumberCounter`] is used. This allows plugging in
1109    /// a custom generator (e.g., HLC for distributed databases).
1110    #[must_use]
1111    pub fn seqno_generator(mut self, generator: SharedSequenceNumberGenerator) -> Self {
1112        self.seqno = generator;
1113        self
1114    }
1115
1116    /// Overrides the visible sequence number generator.
1117    #[must_use]
1118    pub fn visible_seqno_generator(mut self, generator: SharedSequenceNumberGenerator) -> Self {
1119        self.visible_seqno = generator;
1120        self
1121    }
1122
1123    /// Sets the global cache.
1124    ///
1125    /// You can create a global [`Cache`] and share it between multiple
1126    /// trees to cap global cache memory usage.
1127    ///
1128    /// Defaults to a cache with 16 MiB of capacity *per tree*.
1129    #[must_use]
1130    pub fn use_cache(mut self, cache: Arc<Cache>) -> Self {
1131        self.cache = cache;
1132        self
1133    }
1134
1135    /// Sets the file descriptor cache.
1136    ///
1137    /// Can be shared across trees.
1138    #[must_use]
1139    pub fn use_descriptor_table(mut self, descriptor_table: Option<Arc<DescriptorTable>>) -> Self {
1140        self.descriptor_table = descriptor_table;
1141        self
1142    }
1143
1144    /// If `true`, the last level will not build filters, reducing the filter size of a database
1145    /// by ~90% typically.
1146    ///
1147    /// **Enable this only if you know that point reads generally are expected to find a key-value pair.**
1148    #[must_use]
1149    pub fn expect_point_read_hits(mut self, b: bool) -> Self {
1150        self.expect_point_read_hits = b;
1151        self
1152    }
1153
1154    /// Enables per-block Page ECC.
1155    ///
1156    /// When enabled, every block written by this tree carries a parity
1157    /// trailer; on read, if the block's XXH3 disagrees with the on-disk
1158    /// bytes, the reader attempts recovery from the trailer before surfacing
1159    /// the corruption. The correction scheme defaults to per-word SEC-DED and
1160    /// is selectable at runtime (`update_runtime_config`): per-word SEC-DED,
1161    /// single XOR parity, or Reed-Solomon.
1162    ///
1163    /// Opening a tree with `page_ecc = true` on a build that does not
1164    /// have the `page_ecc` cargo feature enabled returns
1165    /// [`crate::Error::PageEccUnsupported`] at `Tree::open` — the
1166    /// reader has no way to honour the parity trailer without the
1167    /// codec, so silently downgrading integrity is not an option.
1168    ///
1169    /// Wired into the on-disk write path via `MultiWriter::use_page_ecc`
1170    /// at every `Tree::open` / `Tree::ingestion` / compaction-worker
1171    /// `MultiWriter` construction site. With this flag set, every
1172    /// `Block::write_into` call those writers make upgrades its
1173    /// `BlockTransform` to the matching `*Ecc` variant — emitting the
1174    /// configured scheme's parity trailer and setting the `ECC_PARITY` flag
1175    /// in each block header (the trailer length is derived from
1176    /// `data_length`, not stored).
1177    #[must_use]
1178    pub fn page_ecc(mut self, enabled: bool) -> Self {
1179        self.page_ecc = enabled;
1180        self
1181    }
1182
1183    /// Enables or disables the cross-process directory lock (default: enabled).
1184    ///
1185    /// When enabled, [`Config::open`] and [`Config::repair`] acquire an
1186    /// exclusive advisory lock on a `LOCK` file in the tree directory, so a
1187    /// second process opening / repairing the same directory fails fast with
1188    /// [`Error::Locked`](crate::Error::Locked) rather than corrupting the shared
1189    /// manifest. Disable ONLY when exclusive directory ownership is already
1190    /// guaranteed at a higher layer (e.g. an embedding keyspace / journal
1191    /// manager that opens each directory at most once per host).
1192    #[must_use]
1193    pub fn with_directory_lock(mut self, enabled: bool) -> Self {
1194        self.directory_lock = enabled;
1195        self
1196    }
1197
1198    /// Sets the Page ECC scheme used when [`Self::page_ecc`] is enabled.
1199    ///
1200    /// ECC is off until `page_ecc(true)`. When on, this picks the
1201    /// algorithm:
1202    /// [`EccScheme::Secded`](crate::runtime_config::EccScheme::Secded)
1203    /// (per-word single-bit correct / double-bit detect, the default, supported
1204    /// at Block granularity),
1205    /// [`EccScheme::Xor`](crate::runtime_config::EccScheme::Xor) (RAID-5
1206    /// single-parity), or
1207    /// [`EccScheme::ReedSolomon`](crate::runtime_config::EccScheme::ReedSolomon).
1208    /// There is no implicit RS(4,2) default.
1209    #[must_use]
1210    pub fn ecc_scheme(mut self, scheme: crate::runtime_config::EccScheme) -> Self {
1211        self.initial_runtime_config.ecc_scheme = scheme;
1212        self
1213    }
1214
1215    /// Sets whether the writer clears per-file copy-on-write on newly created
1216    /// SST / blob files when the backing filesystem is copy-on-write (Btrfs).
1217    ///
1218    /// Default `true`: write-once SSTs gain no benefit from `CoW` but suffer a
1219    /// fragmentation penalty (~20% write throughput on Btrfs), so clearing it
1220    /// recovers the ext4-equivalent baseline. A no-op on non-`CoW` filesystems.
1221    /// Set `false` to preserve `CoW` (e.g. Btrfs subvolume snapshots that depend
1222    /// on it). See [`crate::runtime_config::RuntimeConfig::disable_cow_on_sst_files`].
1223    #[must_use]
1224    pub fn disable_cow_on_sst_files(mut self, enabled: bool) -> Self {
1225        self.initial_runtime_config.disable_cow_on_sst_files = enabled;
1226        self
1227    }
1228
1229    /// Sets whether [`crate::AbstractTree::create_checkpoint`] clones files via
1230    /// reflink (`FICLONE` / `clonefile`) when the filesystem supports it,
1231    /// falling back to a hard link otherwise.
1232    ///
1233    /// Default `true`: a reflinked checkpoint has an independent inode (no
1234    /// max-links constraint, modifications never touch the original) at O(1)
1235    /// cost via copy-on-write block sharing. A no-op (hard-link path) on
1236    /// filesystems without reflink. See
1237    /// [`crate::runtime_config::RuntimeConfig::use_reflink_for_checkpoint`].
1238    #[must_use]
1239    pub fn use_reflink_for_checkpoint(mut self, enabled: bool) -> Self {
1240        self.initial_runtime_config.use_reflink_for_checkpoint = enabled;
1241        self
1242    }
1243
1244    /// Sets the initial [`crate::runtime_config::RuntimeConfig`]
1245    /// snapshot the tree will start with.
1246    ///
1247    /// Seeds both the first manifest write and the live
1248    /// `RuntimeConfigHandle` exposed via
1249    /// [`crate::Tree::runtime_config`].
1250    ///
1251    /// **Manifest-hardening toggles** in the supplied snapshot
1252    /// that are currently wired through the writer
1253    /// (`manifest_footer_mirror`, `page_ecc` *as consumed by
1254    /// `manifest_blocks::writer` when picking the `BlockTransform`
1255    /// variant*) take effect from byte zero of the on-disk
1256    /// manifest rather than waiting for a post-open
1257    /// [`crate::Tree::update_runtime_config`] call. Subsequent
1258    /// updates still flow through the live handle and apply to
1259    /// the next manifest write.
1260    ///
1261    /// `manifest_kv_checksums` is plumbed in the snapshot but the
1262    /// writer does NOT yet consult or persist it (per-entry
1263    /// framing + footer-flag slot land in a follow-up). Setting
1264    /// it here today has no on-disk effect; it is exposed for
1265    /// forward-compat with no behaviour break.
1266    ///
1267    /// **Note on data-block ECC:** `RuntimeConfig::page_ecc`
1268    /// currently affects manifest Blocks only — data-block ECC is
1269    /// still gated by [`Config::page_ecc`] at tree-open time. The
1270    /// SST writer path consumes the tree-static config, not the
1271    /// runtime handle. Wiring through SST emission is a follow-up.
1272    #[must_use]
1273    pub fn with_runtime_config(mut self, runtime: crate::runtime_config::RuntimeConfig) -> Self {
1274        self.initial_runtime_config = runtime;
1275        self
1276    }
1277
1278    /// Sets the partitioning policy for filter blocks.
1279    #[must_use]
1280    pub fn filter_block_partitioning_policy(mut self, policy: PinningPolicy) -> Self {
1281        self.filter_block_partitioning_policy = policy;
1282        self
1283    }
1284
1285    /// Sets the partitioning policy for index blocks.
1286    #[must_use]
1287    pub fn index_block_partitioning_policy(mut self, policy: PinningPolicy) -> Self {
1288        self.index_block_partitioning_policy = policy;
1289        self
1290    }
1291
1292    /// Sets the pinning policy for filter blocks.
1293    #[must_use]
1294    pub fn filter_block_pinning_policy(mut self, policy: PinningPolicy) -> Self {
1295        self.filter_block_pinning_policy = policy;
1296        self
1297    }
1298
1299    /// Sets the pinning policy for index blocks.
1300    #[must_use]
1301    pub fn index_block_pinning_policy(mut self, policy: PinningPolicy) -> Self {
1302        self.index_block_pinning_policy = policy;
1303        self
1304    }
1305
1306    /// Sets the restart interval inside data blocks.
1307    ///
1308    /// A higher restart interval saves space while increasing lookup times
1309    /// inside data blocks.
1310    ///
1311    /// Default = 16
1312    ///
1313    /// # Panics
1314    ///
1315    /// Panics if any restart interval in `policy` is zero.
1316    #[must_use]
1317    pub fn data_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self {
1318        assert!(
1319            policy.iter().all(|interval| *interval > 0),
1320            "data block restart interval must be greater than zero",
1321        );
1322        self.data_block_restart_interval_policy = policy;
1323        self
1324    }
1325
1326    /// Sets the restart interval inside index blocks.
1327    ///
1328    /// A higher restart interval saves space while increasing lookup times
1329    /// inside index blocks.
1330    ///
1331    /// Default = 1
1332    ///
1333    /// # Panics
1334    ///
1335    /// Panics if any restart interval in `policy` is zero.
1336    #[must_use]
1337    pub fn index_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self {
1338        assert!(
1339            policy.iter().all(|interval| *interval > 0),
1340            "index block restart interval must be greater than zero",
1341        );
1342        self.index_block_restart_interval_policy = policy;
1343        self
1344    }
1345
1346    /// Sets the filter construction policy.
1347    #[must_use]
1348    pub fn filter_policy(mut self, policy: FilterPolicy) -> Self {
1349        self.filter_policy = policy;
1350        self
1351    }
1352
1353    /// Sets the retrieval-ribbon locator policy.
1354    ///
1355    /// On by default at [`LocatorPrecision::Block`] (see
1356    /// [`LocatorPolicy::block_level`]). When enabled for a level, written SSTs on
1357    /// that level carry an optional `locator` section mapping each key to its
1358    /// data block (and, at finer precisions, its slot), letting point reads skip
1359    /// the index-block binary search. Set [`LocatorPolicy::disabled`] to opt out;
1360    /// disabled levels emit byte-identical SSTs.
1361    #[must_use]
1362    pub fn locator_policy(mut self, policy: LocatorPolicy) -> Self {
1363        self.locator_policy = policy;
1364        self
1365    }
1366
1367    /// Sets the compression method for data blocks.
1368    #[must_use]
1369    pub fn data_block_compression_policy(mut self, policy: CompressionPolicy) -> Self {
1370        self.data_block_compression_policy = policy;
1371        self
1372    }
1373
1374    /// Sets the compression method for index blocks.
1375    #[must_use]
1376    pub fn index_block_compression_policy(mut self, policy: CompressionPolicy) -> Self {
1377        self.index_block_compression_policy = policy;
1378        self
1379    }
1380
1381    // TODO: level count is fixed to 7 right now
1382    // /// Sets the number of levels of the LSM tree (depth of tree).
1383    // ///
1384    // /// Defaults to 7, like `LevelDB` and `RocksDB`.
1385    // ///
1386    // /// Cannot be changed once set.
1387    // ///
1388    // /// # Panics
1389    // ///
1390    // /// Panics if `n` is 0.
1391    // #[must_use]
1392    // pub fn level_count(mut self, n: u8) -> Self {
1393    //     assert!(n > 0);
1394
1395    //     self.level_count = n;
1396    //     self
1397    // }
1398
1399    /// Sets the data block size policy.
1400    #[must_use]
1401    pub fn data_block_size_policy(mut self, policy: BlockSizePolicy) -> Self {
1402        self.data_block_size_policy = policy;
1403        self
1404    }
1405
1406    /// Sets the hash ratio policy for data blocks.
1407    ///
1408    /// If greater than 0.0, a hash index is embedded into data blocks that can speed up reads
1409    /// inside the data block.
1410    #[must_use]
1411    pub fn data_block_hash_ratio_policy(mut self, policy: HashRatioPolicy) -> Self {
1412        self.data_block_hash_ratio_policy = policy;
1413        self
1414    }
1415
1416    /// Toggles key-value separation.
1417    #[must_use]
1418    pub fn with_kv_separation(mut self, opts: Option<KvSeparationOptions>) -> Self {
1419        self.kv_separation_opts = opts;
1420        self
1421    }
1422
1423    /// Installs a custom compaction filter.
1424    #[must_use]
1425    pub fn with_compaction_filter_factory(mut self, factory: Option<Arc<dyn Factory>>) -> Self {
1426        self.compaction_filter_factory = factory;
1427        self
1428    }
1429
1430    /// Sets the prefix extractor for prefix bloom filters.
1431    ///
1432    /// When configured, bloom filters will index key prefixes returned by
1433    /// the extractor. Prefix scans can then skip segments whose bloom
1434    /// filter reports no match for the scan prefix.
1435    #[must_use]
1436    pub fn prefix_extractor(mut self, extractor: Arc<dyn PrefixExtractor>) -> Self {
1437        self.prefix_extractor = Some(extractor);
1438        self
1439    }
1440
1441    /// Installs a merge operator for commutative operations.
1442    ///
1443    /// When set, enables [`crate::AbstractTree::merge`] which stores partial updates
1444    /// (operands) that are lazily combined during reads and compaction.
1445    #[must_use]
1446    pub fn with_merge_operator(mut self, op: Option<Arc<dyn MergeOperator>>) -> Self {
1447        self.merge_operator = op;
1448        self
1449    }
1450
1451    /// Sets a custom user key comparator.
1452    ///
1453    /// When configured, all key ordering (memtable, block index, merge,
1454    /// range scans) uses this comparator instead of the default lexicographic
1455    /// byte ordering.
1456    ///
1457    /// # Important
1458    ///
1459    /// The comparator's [`crate::UserComparator::name`] is persisted when a tree is
1460    /// first created. On subsequent opens the stored name is compared against
1461    /// the supplied comparator's name — a mismatch causes the open to fail
1462    /// with [`Error::ComparatorMismatch`](crate::Error::ComparatorMismatch).
1463    #[must_use]
1464    pub fn comparator(mut self, comparator: SharedComparator) -> Self {
1465        self.comparator = comparator;
1466        self
1467    }
1468
1469    /// Sets the block-level encryption provider for encryption at rest.
1470    ///
1471    /// When set, all blocks written to SST files are encrypted after
1472    /// compression and before checksumming, using the provided
1473    /// [`EncryptionProvider`].
1474    ///
1475    /// The caller is responsible for key management and rotation.
1476    /// See `crate::Aes256GcmProvider` (behind the `encryption` feature)
1477    /// for a ready-to-use AES-256-GCM implementation.
1478    ///
1479    /// **Important constraints:**
1480    /// - Encryption state is NOT recorded in SST metadata. Opening an
1481    ///   encrypted tree without the correct provider (or vice versa) will
1482    ///   cause block validation errors, not silent corruption.
1483    /// - Blob files (KV-separated large values) are NOT covered by
1484    ///   block-level encryption. Large values stored via KV separation
1485    ///   remain in plaintext on disk.
1486    #[must_use]
1487    pub fn with_encryption(mut self, encryption: Option<Arc<dyn EncryptionProvider>>) -> Self {
1488        self.encryption = encryption;
1489        self
1490    }
1491
1492    /// Sets the MANIFEST recovery policy for `Tree::open`.
1493    ///
1494    /// The default ([`ManifestRecoveryMode::AbsoluteConsistency`]) is the
1495    /// only choice that's safe for live production: any corrupt record
1496    /// in the on-disk manifest aborts the open. Switching to a more
1497    /// permissive mode trades strict correctness for partial
1498    /// availability after a disaster. The recovery path emits a
1499    /// `warn!` summary per affected section (aggregate counts: total
1500    /// table records dropped, total blob-file records dropped,
1501    /// header truncations) rather than one log line per dropped
1502    /// record — the dropped records were never decoded in the first
1503    /// place, so no per-record IDs are available. Always pair the
1504    /// non-default modes with an out-of-band integrity scan
1505    /// ([`verify_integrity`](crate::verify::verify_integrity) for
1506    /// whole-file XXH3 over every SST + blob file, or
1507    /// [`verify_block_checksums`](crate::verify::verify_block_checksums)
1508    /// for per-block granularity) before trusting the recovered tree
1509    /// for writes.
1510    ///
1511    /// See the [`ManifestRecoveryMode`] doc for per-variant semantics.
1512    #[must_use]
1513    pub fn manifest_recovery_mode(mut self, mode: ManifestRecoveryMode) -> Self {
1514        self.manifest_recovery_mode = mode;
1515        self
1516    }
1517
1518    /// Sets the durability level for every fsync the tree issues.
1519    ///
1520    /// Defaults to [`SyncMode::Normal`] (plain `fsync`, matching `RocksDB` /
1521    /// `SQLite` defaults). Pass [`SyncMode::Full`] to force `F_FULLFSYNC` on
1522    /// macOS for power-loss durability without an external journal — at a
1523    /// large per-flush cost. On non-macOS platforms both modes are
1524    /// identical (plain `fsync`).
1525    #[must_use]
1526    pub fn sync_mode(mut self, mode: SyncMode) -> Self {
1527        self.sync_mode = mode;
1528        self
1529    }
1530
1531    /// Sets the edit-log rotation threshold in bytes (default 1 MiB).
1532    ///
1533    /// Once the manifest edit log exceeds this size, the next version upgrade
1534    /// writes a fresh full snapshot and starts an empty log instead of appending
1535    /// another edit. Lower it to shorten recovery replay and cap log size at the
1536    /// cost of more frequent full-snapshot writes; `0` rotates on every upgrade.
1537    #[must_use]
1538    pub fn manifest_log_rotate_bytes(mut self, bytes: u64) -> Self {
1539        self.manifest_log_rotate_bytes = bytes;
1540        self
1541    }
1542
1543    /// Sets the compaction I/O rate limit in bytes per second.
1544    ///
1545    /// Caps how fast the compaction worker may issue I/O so background
1546    /// compaction does not saturate the device and spike user read P99.
1547    /// `0` (the default) disables throttling. Only compaction is limited;
1548    /// flush and user reads always pass through.
1549    #[must_use]
1550    pub fn compaction_rate_limit(mut self, bytes_per_sec: u64) -> Self {
1551        self.compaction_rate_limit = bytes_per_sec;
1552        self
1553    }
1554
1555    /// Sets the compaction worker-thread count.
1556    ///
1557    /// Under `std` this both sizes the per-tree block-compression pool built at
1558    /// open when no shared pool is supplied (see [`Self::compaction_pool`]) and
1559    /// caps how many range-parallel sub-compactions a compaction splits into.
1560    /// `1` keeps compaction serial. Default is `max(1, available_parallelism /
1561    /// 2)`. Without the `parallel` feature there is no built-in pool, so the
1562    /// work runs serially even for a value > 1.
1563    #[cfg(feature = "std")]
1564    #[must_use]
1565    pub fn compaction_threads(mut self, threads: usize) -> Self {
1566        // Clamp to >= 1: the documented semantics treat `1` as "serial", and a
1567        // 0-thread pool would be an invalid state.
1568        self.compaction_threads = threads.max(1);
1569        self
1570    }
1571
1572    /// Sets the minimum total input size (bytes) for a compaction to be split
1573    /// into parallel sub-compactions. Default 8 MiB. `0` splits every eligible
1574    /// compaction; a large value effectively disables sub-compaction (block
1575    /// compression still parallelizes via [`Self::compaction_threads`]).
1576    #[cfg(feature = "std")]
1577    #[must_use]
1578    pub fn subcompaction_min_bytes(mut self, bytes: u64) -> Self {
1579        self.subcompaction_min_bytes = bytes;
1580        self
1581    }
1582
1583    /// Supplies a shared compaction thread pool, used in place of the per-tree
1584    /// default. Pass one [`crate::table::writer::CompactionSpawner`] (e.g. a
1585    /// `RayonSpawner` wrapping a shared rayon thread pool) to several trees so
1586    /// the total worker-thread count stays bounded by the pool size rather than
1587    /// the number of open trees.
1588    #[cfg(feature = "std")]
1589    #[must_use]
1590    pub fn compaction_pool(
1591        mut self,
1592        pool: Option<Arc<dyn crate::table::writer::CompactionSpawner>>,
1593    ) -> Self {
1594        self.compaction_pool = pool;
1595        self
1596    }
1597
1598    /// Sets the pre-trained zstd dictionary for dictionary compression.
1599    ///
1600    /// When set, data blocks using [`CompressionType::ZstdDict`] will be
1601    /// compressed and decompressed with this dictionary. The dictionary
1602    /// should be trained on representative data samples for best results.
1603    ///
1604    /// Create a dictionary with [`ZstdDictionary::new`](crate::ZstdDictionary::new),
1605    /// then use [`CompressionType::zstd_dict`] to create a matching
1606    /// compression type:
1607    ///
1608    /// ```ignore
1609    /// use lsm_tree::{CompressionType, ZstdDictionary};
1610    ///
1611    /// let dict = ZstdDictionary::new(&training_data);
1612    /// let compression = CompressionType::zstd_dict(3, dict.id()).unwrap();
1613    ///
1614    /// config
1615    ///     .zstd_dictionary(Some(Arc::new(dict)))
1616    ///     .data_block_compression_policy(CompressionPolicy::all(compression));
1617    /// ```
1618    #[cfg(zstd_any)]
1619    #[must_use]
1620    pub fn zstd_dictionary(
1621        mut self,
1622        dictionary: Option<Arc<crate::compression::ZstdDictionary>>,
1623    ) -> Self {
1624        self.zstd_dictionary = dictionary;
1625        self
1626    }
1627}
1628
1629#[cfg(test)]
1630mod builder_tests;
lsm_tree/config/mod.rs

lsm_tree/config/
mod.rs